{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "rtox72csOQUN"
   },
   "source": [
    "# DeepMatch 样例代码\n",
    "- https://github.com/shenweichen/DeepMatch\n",
    "- https://deepmatch.readthedocs.io/en/latest/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "bTWHz-heMkyw"
   },
   "source": [
    "# 下载movielens-1M数据 安装依赖包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "yTl6d6jO1oqf",
    "outputId": "ca32c49d-102b-46e8-d613-1c33885326ff"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2022-07-03 12:34:44--  http://files.grouplens.org/datasets/movielens/ml-1m.zip\n",
      "Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152\n",
      "Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 5917549 (5.6M) [application/zip]\n",
      "Saving to: ‘./ml-1m.zip’\n",
      "\n",
      "./ml-1m.zip         100%[===================>]   5.64M  19.1MB/s    in 0.3s    \n",
      "\n",
      "2022-07-03 12:34:44 (19.1 MB/s) - ‘./ml-1m.zip’ saved [5917549/5917549]\n",
      "\n",
      "--2022-07-03 12:34:44--  https://raw.githubusercontent.com/shenweichen/DeepMatch/dev/sample/examples/preprocess.py\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 6705 (6.5K) [text/plain]\n",
      "Saving to: ‘preprocess.py’\n",
      "\n",
      "preprocess.py       100%[===================>]   6.55K  --.-KB/s    in 0s      \n",
      "\n",
      "2022-07-03 12:34:45 (57.6 MB/s) - ‘preprocess.py’ saved [6705/6705]\n",
      "\n",
      "Archive:  ml-1m.zip\n",
      "   creating: ml-1m/\n",
      "  inflating: ml-1m/movies.dat        \n",
      "  inflating: ml-1m/ratings.dat       \n",
      "  inflating: ml-1m/README            \n",
      "  inflating: ml-1m/users.dat         \n",
      "\u001b[K     |████████████████████████████████| 454.3 MB 16 kB/s \n",
      "\u001b[K     |████████████████████████████████| 14.8 MB 43.1 MB/s \n",
      "\u001b[K     |████████████████████████████████| 4.0 MB 52.7 MB/s \n",
      "\u001b[K     |████████████████████████████████| 132 kB 71.2 MB/s \n",
      "\u001b[K     |████████████████████████████████| 1.2 MB 61.6 MB/s \n",
      "\u001b[K     |████████████████████████████████| 4.0 MB 63.3 MB/s \n",
      "\u001b[K     |████████████████████████████████| 462 kB 54.4 MB/s \n",
      "\u001b[?25h  Building wheel for wrapt (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
      "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
      "kapre 0.3.7 requires tensorflow>=2.0.0, which is not installed.\n",
      "xarray-einstats 0.2.2 requires numpy>=1.21, but you have numpy 1.19.5 which is incompatible.\n",
      "deepctr 0.9.1 requires h5py==2.10.0, but you have h5py 3.1.0 which is incompatible.\n",
      "datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.\n",
      "albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.\u001b[0m\n",
      "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
      "tensorflow-gpu 2.5.0 requires h5py~=3.1.0, but you have h5py 2.10.0 which is incompatible.\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "! wget http://files.grouplens.org/datasets/movielens/ml-1m.zip -O ./ml-1m.zip \n",
    "! wget https://raw.githubusercontent.com/shenweichen/DeepMatch/master/examples/preprocess.py -O preprocess.py\n",
    "! unzip -o ml-1m.zip \n",
    "! pip uninstall -y -q tensorflow\n",
    "! pip install -q tensorflow-gpu==2.5.0\n",
    "! pip install -q deepmatch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "p9UxNHuPMuW2"
   },
   "source": [
    "# 导入需要的库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "C_ZR6gzp1E2N"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from deepctr.feature_column import SparseFeat, VarLenSparseFeat\n",
    "from preprocess import gen_data_set, gen_model_input,gen_data_set_sdm,gen_model_input_sdm\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from tensorflow.python.keras import backend as K\n",
    "from tensorflow.python.keras.models import Model\n",
    "\n",
    "from deepmatch.models import *\n",
    "from deepmatch.utils import sampledsoftmaxloss, NegativeSampler"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "fQq6O9XAMzPF"
   },
   "source": [
    "# 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "lcO29zFb21Od",
    "outputId": "cda19a71-6a6e-4113-f42d-ab80f06273b8"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
      "  after removing the cwd from sys.path.\n",
      "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
      "  \n",
      "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:8: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "data_path = \"./\"\n",
    "\n",
    "unames = ['user_id','gender','age','occupation','zip']\n",
    "user = pd.read_csv(data_path+'ml-1m/users.dat',sep='::',header=None,names=unames)\n",
    "rnames = ['user_id','movie_id','rating','timestamp']\n",
    "ratings = pd.read_csv(data_path+'ml-1m/ratings.dat',sep='::',header=None,names=rnames)\n",
    "mnames = ['movie_id','title','genres']\n",
    "movies = pd.read_csv(data_path+'ml-1m/movies.dat',sep='::',header=None,names=mnames,encoding=\"unicode_escape\")\n",
    "movies['genres'] = list(map(lambda x: x.split('|')[0], movies['genres'].values))\n",
    "\n",
    "data = pd.merge(pd.merge(ratings,movies),user)#.iloc[:10000]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "L0yCWxQxM3se"
   },
   "source": [
    "# 构建特征列，训练模型，导出embedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 913
    },
    "id": "BMOvk_de2ML3",
    "outputId": "eba1ad5c-7a45-4b30-84f6-0d19f556834c"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 6040/6040 [00:14<00:00, 402.76it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10 10\n",
      "WARNING:tensorflow:From /Users/swc/study/DeepMatch/deepmatch/layers/sequence.py:35: BasicLSTMCell.__init__ (from tensorflow.python.keras.layers.legacy_rnn.rnn_cell_impl) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.\n",
      "WARNING:tensorflow:From /Users/swc/study/DeepMatch/deepmatch/layers/sequence.py:65: MultiRNNCell.__init__ (from tensorflow.python.keras.layers.legacy_rnn.rnn_cell_impl) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.\n",
      "WARNING:tensorflow:From /Users/swc/study/DeepMatch/deepmatch/layers/sequence.py:78: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Please use `keras.layers.RNN(cell)`, which is equivalent to this API\n",
      "WARNING:tensorflow:From /Users/swc/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/layers/legacy_rnn/rnn_cell_impl.py:740: Layer.add_variable (from tensorflow.python.keras.engine.base_layer_v1) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Please use `layer.add_weight` method instead.\n",
      "WARNING:tensorflow:From /Users/swc/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/layers/legacy_rnn/rnn_cell_impl.py:744: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
      "WARNING:tensorflow:From /Users/swc/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/initializers/initializers_v1.py:68: calling TruncatedNormal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
      "WARNING:tensorflow:From /Users/swc/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py:158: calling LinearOperator.__init__ (from tensorflow.python.ops.linalg.linear_operator) with graph_parents is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Do not pass `graph_parents`.  They will  no longer be used.\n",
      "Train on 988129 samples\n",
      "Epoch 1/20\n",
      "988129/988129 [==============================] - 114s 116us/sample - loss: 5.1889\n",
      "Epoch 2/20\n",
      "988129/988129 [==============================] - 123s 125us/sample - loss: 4.2234\n",
      "Epoch 3/20\n",
      "988129/988129 [==============================] - 121s 123us/sample - loss: 4.0599\n",
      "Epoch 4/20\n",
      "988129/988129 [==============================] - 130s 131us/sample - loss: 3.9574\n",
      "Epoch 5/20\n",
      "988129/988129 [==============================] - 124s 125us/sample - loss: 3.8822\n",
      "Epoch 6/20\n",
      "988129/988129 [==============================] - 121s 123us/sample - loss: 3.8313\n",
      "Epoch 7/20\n",
      "988129/988129 [==============================] - 112s 114us/sample - loss: 3.7889\n",
      "Epoch 8/20\n",
      "988129/988129 [==============================] - 102s 103us/sample - loss: 3.7559\n",
      "Epoch 9/20\n",
      "988129/988129 [==============================] - 102s 103us/sample - loss: 3.7315\n",
      "Epoch 10/20\n",
      "988129/988129 [==============================] - 101s 102us/sample - loss: 3.7082\n",
      "Epoch 11/20\n",
      "988129/988129 [==============================] - 114s 115us/sample - loss: 3.6901\n",
      "Epoch 12/20\n",
      "988129/988129 [==============================] - 109s 111us/sample - loss: 3.6750\n",
      "Epoch 13/20\n",
      "988129/988129 [==============================] - 108s 109us/sample - loss: 3.6606\n",
      "Epoch 14/20\n",
      "988129/988129 [==============================] - 105s 106us/sample - loss: 3.6482\n",
      "Epoch 15/20\n",
      "988129/988129 [==============================] - 119s 120us/sample - loss: 3.6363\n",
      "Epoch 16/20\n",
      "988129/988129 [==============================] - 119s 120us/sample - loss: 3.6288\n",
      "Epoch 17/20\n",
      "988129/988129 [==============================] - 120s 121us/sample - loss: 3.6193\n",
      "Epoch 18/20\n",
      "988129/988129 [==============================] - 122s 123us/sample - loss: 3.6123\n",
      "Epoch 19/20\n",
      "988129/988129 [==============================] - 122s 124us/sample - loss: 3.6049\n",
      "Epoch 20/20\n",
      "988129/988129 [==============================] - 117s 119us/sample - loss: 3.5990\n",
      "WARNING:tensorflow:From /Users/swc/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n",
      "(6040, 32)\n",
      "(3706, 32)\n"
     ]
    }
   ],
   "source": [
    "#data = pd.read_csvdata = pd.read_csv(\"./movielens_sample.txt\")\n",
    "sparse_features = ['user_id','movie_id','gender', 'age', 'occupation', 'zip', 'genres']\n",
    "SEQ_LEN = 50\n",
    "SEQ_LEN_short = 5\n",
    "SEQ_LEN_prefer = 50\n",
    "negsample = 0\n",
    "\n",
    "# 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input`\n",
    "\n",
    "feature_max_idx = {}\n",
    "for feature in sparse_features:\n",
    "    lbe = LabelEncoder()\n",
    "    data[feature] = lbe.fit_transform(data[feature]) + 1\n",
    "    feature_max_idx[feature] = data[feature].max() + 1\n",
    "features = sparse_features\n",
    "user_profile = data[features].drop_duplicates('user_id')\n",
    "\n",
    "item_profile = data[[\"movie_id\"]].drop_duplicates('movie_id')\n",
    "\n",
    "user_profile.set_index(\"user_id\", inplace=True)\n",
    "\n",
    "user_item_list = data.groupby(\"user_id\")['movie_id'].apply(list)\n",
    "\n",
    "train_set, test_set = gen_data_set_sdm(data, seq_short_max_len=SEQ_LEN_short, seq_prefer_max_len=SEQ_LEN_prefer)\n",
    "\n",
    "train_model_input, train_label = gen_model_input_sdm(train_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer)\n",
    "test_model_input, test_label = gen_model_input_sdm(test_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer)\n",
    "\n",
    "# 2.count #unique features for each sparse field and generate feature config for sequence feature\n",
    "\n",
    "embedding_dim = 32\n",
    "\n",
    "\n",
    "user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16),\n",
    "                        SparseFeat(\"gender\", feature_max_idx['gender'], 16),\n",
    "                        SparseFeat(\"age\", feature_max_idx['age'], 16),\n",
    "                        SparseFeat(\"occupation\", feature_max_idx['occupation'], 16),\n",
    "                        SparseFeat(\"zip\", feature_max_idx['zip'], 16),\n",
    "                        VarLenSparseFeat(SparseFeat('short_movie_id', feature_max_idx['movie_id'], embedding_dim,\n",
    "                                                    embedding_name=\"movie_id\"), SEQ_LEN_short, 'mean',\n",
    "                                         'short_sess_length'),\n",
    "                        VarLenSparseFeat(SparseFeat('prefer_movie_id', feature_max_idx['movie_id'], embedding_dim,\n",
    "                                                    embedding_name=\"movie_id\"), SEQ_LEN_prefer, 'mean',\n",
    "                                         'prefer_sess_length'),\n",
    "                        VarLenSparseFeat(SparseFeat('short_genres', feature_max_idx['genres'], embedding_dim,\n",
    "                                                    embedding_name=\"genres\"), SEQ_LEN_short, 'mean',\n",
    "                                         'short_sess_length'),\n",
    "                        VarLenSparseFeat(SparseFeat('prefer_genres', feature_max_idx['genres'], embedding_dim,\n",
    "                                                    embedding_name=\"genres\"), SEQ_LEN_prefer, 'mean',\n",
    "                                         'prefer_sess_length'),\n",
    "                        ]\n",
    "\n",
    "\n",
    "item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]\n",
    "\n",
    "from collections import Counter\n",
    "train_counter = Counter(train_model_input['movie_id'])\n",
    "item_count = [train_counter.get(i,0) for i in range(item_feature_columns[0].vocabulary_size)]\n",
    "sampler_config = NegativeSampler('frequency',num_sampled=255,item_name=\"movie_id\",item_count=item_count)\n",
    "\n",
    "# 3.Define Model and train\n",
    "\n",
    "import tensorflow as tf\n",
    "if tf.__version__ >= '2.0.0':\n",
    "    tf.compat.v1.disable_eager_execution()\n",
    "else:\n",
    "    K.set_learning_phase(True)\n",
    "\n",
    "model = SDM(user_feature_columns, item_feature_columns, history_feature_list=['movie_id','genres'],\n",
    "            units=embedding_dim, sampler_config=sampler_config )\n",
    "\n",
    "model.compile(optimizer=\"adam\", loss=sampledsoftmaxloss)\n",
    "\n",
    "history = model.fit(train_model_input, train_label,  # train_label,\n",
    "                    batch_size=512, epochs=20, verbose=1, validation_split=0.0, )\n",
    "\n",
    "# 4. Generate user features for testing and full item features for retrieval\n",
    "test_user_model_input = test_model_input\n",
    "all_item_model_input = {\"movie_id\": item_profile['movie_id'].values,}\n",
    "\n",
    "user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)\n",
    "item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)\n",
    "\n",
    "user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)\n",
    "# user_embs = user_embs[:, i, :]  # i in [0,k_max) if MIND\n",
    "item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)\n",
    "\n",
    "print(user_embs.shape)\n",
    "print(item_embs.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "w_G3KWslKmJo"
   },
   "source": [
    "# 使用faiss进行ANN查找并评估结果"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "5SvyQLNVKkcs"
   },
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "j2ZNYNBOOqrN"
   },
   "outputs": [],
   "source": [
    "! pip install faiss-cpu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "id": "6TY1l27iJU8U"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "6040it [00:01, 3381.75it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "recall 0.47516556291390727\n",
      "hit rate 0.47516556291390727\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "test_true_label = {line[0]:[line[1]] for line in test_set}\n",
    "\n",
    "import numpy as np\n",
    "import faiss\n",
    "from tqdm import tqdm\n",
    "from deepmatch.utils import recall_N\n",
    "\n",
    "index = faiss.IndexFlatIP(embedding_dim)\n",
    "# faiss.normalize_L2(item_embs)\n",
    "index.add(item_embs)\n",
    "# faiss.normalize_L2(user_embs)\n",
    "D, I = index.search(np.ascontiguousarray(user_embs), 50)\n",
    "s = []\n",
    "hit = 0\n",
    "for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):\n",
    "    try:\n",
    "        pred = [item_profile['movie_id'].values[x] for x in I[i]]\n",
    "        filter_item = None\n",
    "        recall_score = recall_N(test_true_label[uid], pred, N=50)\n",
    "        s.append(recall_score)\n",
    "        if test_true_label[uid] in pred:\n",
    "            hit += 1\n",
    "    except:\n",
    "        print(i)\n",
    "print(\"\")\n",
    "print(\"recall\", np.mean(s))\n",
    "print(\"hit rate\", hit / len(test_user_model_input['user_id']))"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "colab_MovieLen1M_SDM.ipynb",
   "provenance": [],
   "toc_visible": true
  },
  "gpuClass": "standard",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
